library(dplyr)
library(tidyr)
library(ggplot2)
theme_set(theme_minimal())
source: https://www.dwd.de/DE/leistungen/klimadatendeutschland/klarchivtagmonat.html
raw_hist <- read.delim('data/produkt_klima_tag_19500101_20221231_00403.txt', sep = ';')
head(raw_hist)
## STATIONS_ID MESS_DATUM QN_3 FX FM QN_4 RSK RSKF SDK SHK_TAG NM VPM
## 1 403 19500101 -999 -999 -999 5 2.2 7 -999 0 5.0 4.0
## 2 403 19500102 -999 -999 -999 5 12.6 8 -999 0 8.0 6.1
## 3 403 19500103 -999 -999 -999 5 0.5 1 -999 0 5.0 6.5
## 4 403 19500104 -999 -999 -999 5 0.5 7 -999 0 7.7 5.2
## 5 403 19500105 -999 -999 -999 5 10.3 7 -999 0 8.0 4.0
## 6 403 19500106 -999 -999 -999 5 7.2 8 -999 12 7.3 5.6
## PM TMK UPM TXK TNK TGK eor
## 1 1025.6 -3.2 83 -1.1 -4.9 -6.3 eor
## 2 1005.6 1.0 95 2.2 -3.7 -5.3 eor
## 3 996.6 2.8 86 3.9 1.7 -1.4 eor
## 4 999.5 -0.1 85 2.1 -0.9 -2.3 eor
## 5 1001.1 -2.8 79 -0.9 -3.3 -5.2 eor
## 6 997.5 2.6 79 5.0 -4.0 -4.0 eor
raw_pres <- read.delim('data/produkt_klima_tag_20221107_20240509_00403.txt', sep = ';')
head(raw_pres)
## STATIONS_ID MESS_DATUM QN_3 FX FM QN_4 RSK RSKF SDK SHK_TAG NM VPM
## 1 403 20221107 -999 -999 -999 10 0.0 6 4.5 0 6.2 9.6
## 2 403 20221108 -999 -999 -999 10 0.2 6 7.5 0 6.0 10.4
## 3 403 20221109 -999 -999 -999 10 1.0 6 3.7 0 6.6 11.4
## 4 403 20221110 -999 -999 -999 10 0.0 0 6.1 0 5.1 10.2
## 5 403 20221111 -999 -999 -999 10 0.0 0 1.9 0 6.3 9.6
## 6 403 20221112 -999 -999 -999 10 0.0 0 7.3 0 4.0 8.8
## PM TMK UPM TXK TNK TGK eor
## 1 1002.9 10.7 75 15.0 6.4 5.1 eor
## 2 1002.7 12.1 75 16.9 7.9 4.2 eor
## 3 1001.5 11.8 83 15.0 9.0 5.1 eor
## 4 1012.6 11.7 74 14.3 8.6 5.8 eor
## 5 1020.1 8.6 87 12.8 4.0 0.6 eor
## 6 1022.8 6.4 92 13.8 1.8 -0.9 eor
meas <- bind_rows(raw_hist, raw_pres) |>
select(date = MESS_DATUM, temp = TMK) |>
mutate(date = as.POSIXct(strptime(date, "%Y%m%d")),
year = as.integer(as.numeric(format(date, "%Y"))),
day = as.integer(as.numeric(format(date, "%j")))) |>
distinct(date, .keep_all = TRUE)
rm(raw_hist, raw_pres)
stopifnot(all(count(meas, date)$n == 1))
head(meas)
## date temp year day
## 1 1950-01-01 -3.2 1950 1
## 2 1950-01-02 1.0 1950 2
## 3 1950-01-03 2.8 1950 3
## 4 1950-01-04 -0.1 1950 4
## 5 1950-01-05 -2.8 1950 5
## 6 1950-01-06 2.6 1950 6
ggplot(meas, aes(date, temp)) +
geom_line() +
geom_smooth(method = "gam")
filter(meas, year >= 2018) |>
ggplot(aes(date, temp)) +
geom_line()
filter(meas, year == 2023) |>
ggplot(aes(date, temp)) +
geom_line() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(meas, aes(day, temp, color = year)) +
geom_point(alpha = 0.25) +
scale_color_binned(type = 'viridis')
m1 <- lm(temp ~ cos(2 * pi * day/366) + sin(2 * pi * day/366), meas)
summary(m1)
##
## Call:
## lm(formula = temp ~ cos(2 * pi * day/366) + sin(2 * pi * day/366),
## data = meas)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.3493 -2.5635 -0.0229 2.5823 14.0507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.41121 0.02290 410.93 <2e-16 ***
## cos(2 * pi * day/366) -9.00711 0.03244 -277.66 <2e-16 ***
## sin(2 * pi * day/366) -2.55724 0.03234 -79.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.774 on 27155 degrees of freedom
## Multiple R-squared: 0.7544, Adjusted R-squared: 0.7544
## F-statistic: 4.17e+04 on 2 and 27155 DF, p-value: < 2.2e-16
plot(m1)
meas_fit <- cbind(meas, pred = fitted(m1))
filter(meas_fit, year >= 2018) |>
ggplot() +
geom_line(aes(date, temp), alpha = 0.25) +
geom_line(aes(date, pred), color = 'red')
ggplot(meas_fit) +
geom_line(aes(date, temp), alpha = 0.25) +
geom_line(aes(date, pred), color = 'red')
filter(meas_fit, year %in% (1950 + 0:6 * 10)) |>
ggplot(aes(day, temp, color = year)) +
geom_point(alpha = 0.25) +
geom_line(aes(day, pred), color = 'red') +
scale_color_binned(type = 'viridis')
m2 <- lm(temp ~ year + cos(2 * pi * day/366) + sin(2 * pi * day/366), meas)
summary(m2)
##
## Call:
## lm(formula = temp ~ year + cos(2 * pi * day/366) + sin(2 * pi *
## day/366), data = meas)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0982 -2.5367 -0.0541 2.5753 13.0459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -47.296595 2.091820 -22.61 <2e-16 ***
## year 0.028544 0.001053 27.11 <2e-16 ***
## cos(2 * pi * day/366) -9.010653 0.032010 -281.50 <2e-16 ***
## sin(2 * pi * day/366) -2.564623 0.031911 -80.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.724 on 27154 degrees of freedom
## Multiple R-squared: 0.7609, Adjusted R-squared: 0.7608
## F-statistic: 2.88e+04 on 3 and 27154 DF, p-value: < 2.2e-16
plot(m2)
meas_fit2 <- cbind(meas, pred = fitted(m2))
filter(meas_fit2, year >= 2018) |>
ggplot() +
geom_line(aes(date, temp), alpha = 0.25) +
geom_line(aes(date, pred), color = 'red')
ggplot(meas_fit2) +
geom_line(aes(date, temp), alpha = 0.25) +
geom_line(aes(date, pred), color = 'red')
filter(meas_fit2, year %in% (1950 + 0:6 * 10)) |>
ggplot(aes(day, temp, color = year)) +
geom_point(alpha = 0.25) +
geom_line(aes(day, pred, color = year)) +
scale_color_binned(type = 'viridis')
resid <- meas_fit2$temp - meas_fit2$pred
ggplot(data.frame(resid = resid), aes(resid)) +
geom_histogram(bins = 20)
quantile(resid, c(0.05, 0.95))
## 5% 95%
## -5.907213 6.098858
resid_stats <- group_by(meas_fit2, year) |>
summarise(me = mean(temp - pred),
mae = mean(abs(temp - pred)),
prop_days_warmer = mean(temp > pred + 6),
prop_days_colder = mean(temp < pred - 6))
#rmse = sqrt(mean((temp - pred)^2)))
resid_stats
## # A tibble: 75 × 5
## year me mae prop_days_warmer prop_days_colder
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1950 0.942 2.81 0.0932 0.0329
## 2 1951 1.26 2.83 0.0493 0
## 3 1952 0.00242 2.81 0.0492 0.0246
## 4 1953 1.56 3.04 0.112 0.0137
## 5 1954 -0.258 3.05 0.0493 0.0658
## 6 1955 -0.321 2.96 0.0384 0.0603
## 7 1956 -0.977 3.48 0.0328 0.104
## 8 1957 0.677 3.03 0.0904 0.0301
## 9 1958 0.169 2.48 0.0219 0.0164
## 10 1959 1.04 2.85 0.0822 0.0164
## # ℹ 65 more rows
resid_stats_plt <- pivot_longer(resid_stats, !year, names_to = "measure")
filter(resid_stats_plt, measure %in% c("mae", "me")) |>
ggplot(aes(x = year, y = value, fill = measure)) +
geom_col(position = position_dodge())
filter(resid_stats_plt, measure %in% c("prop_days_warmer", "prop_days_colder")) |>
ggplot(aes(x = year, y = value, fill = measure)) +
geom_col(position = position_stack()) +
scale_fill_discrete(limits = rev)
resid_stats_ordered <- filter(resid_stats, year < 2024) |>
arrange(mae)
resid_stats_ordered |> head(1)
## # A tibble: 1 × 5
## year me mae prop_days_warmer prop_days_colder
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 2017 -0.188 2.35 0.0274 0.0137
resid_stats_ordered |> tail(1)
## # A tibble: 1 × 5
## year me mae prop_days_warmer prop_days_colder
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1985 -1.20 3.55 0.0493 0.145
least_deviation_yr <- resid_stats_ordered |> head(1) |> pull(year)
most_deviation_yr <- resid_stats_ordered |> tail(1) |> pull(year)
least_most_plt <- data.frame(year = c(least_deviation_yr, most_deviation_yr), label = c("least deviation", "most deviation")) |>
inner_join(meas_fit2, by = 'year') |>
mutate(label = paste0(year, " (", label, ")"))
ggplot(least_most_plt, aes(day, temp, color = label)) +
geom_point(alpha = 0.25) +
geom_line(aes(day, pred))
ggplot(least_most_plt, aes(day, temp - pred, color = label)) +
geom_point(alpha = 0.25) +
geom_hline(yintercept = 0, linetype = "dashed")